Imports

In [1]:
import pickle
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pyLDAvis.sklearn
import json
import glob
import os
import pprint
import re
import string
import nltk
import warnings
import networkx as nx
import matplotlib.pyplot as plt
from langdetect import detect
warnings.filterwarnings("ignore",category=DeprecationWarning)

Helper Function - Sweep Tweets

In [2]:
def sweep_tweets(tweets):
    """Clean a list of raw tweet strings and return unique English tweets.

    For each tweet: strips selected punctuation, leading RT markers, links,
    '&amp' remnants, non-printable characters, @mentions and bare punctuation
    tokens; then drops tweets detected as non-English and de-duplicates while
    preserving first-seen order.

    Args:
        tweets: iterable of raw tweet strings.

    Returns:
        List of cleaned, unique tweets. Each kept word is followed by a single
        trailing space (an artifact of the accumulation scheme, preserved for
        compatibility with downstream consumers).
    """
    clean_tweets = []
    seen = set()  # O(1) de-dup membership test (replaces O(n^2) list scan)
    for tweet in tweets:
        tweet = re.sub(r'[.,"!]+', '', tweet, flags=re.MULTILINE)  # removes the characters specified
        tweet = re.sub(r'^RT[\s]+', '', tweet, flags=re.MULTILINE)  # removes RT
        tweet = re.sub(r'^rt[\s]+', '', tweet, flags=re.MULTILINE)  # removes rt
        # 'https?' matches both http and https links, so the separate
        # 'http?://' pattern of an earlier revision was redundant.
        tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
        tweet = re.sub(r'[:]+', '', tweet, flags=re.MULTILINE)
        tweet = tweet.replace('&amp', '').replace(';amp;', '')
        tweet = ''.join(filter(lambda x: x in string.printable, tweet))  # filter non-ascii characters

        # Rebuild the tweet without @mentions and bare punctuation tokens.
        clean_tweet = ''
        for word in tweet.split():
            if not word.startswith('@') and word not in string.punctuation:
                clean_tweet += word + ' '
        tweet = clean_tweet

        # Best-effort language filter: langdetect raises on empty/ambiguous
        # input, in which case the tweet is kept rather than dropped.
        try:
            if detect(tweet) != 'en':
                continue
        except Exception:
            pass

        if tweet not in seen:
            seen.add(tweet)
            clean_tweets.append(tweet)
    return clean_tweets

Helper Function - Plot Users and Topics

In [3]:
def convert_to_hex(rgba_color):
    """Convert a color with float channels in [0, 1] to a '#rrggbb' hex string.

    Only the first three channels (red, green, blue) are used; an alpha
    channel, if present, is ignored. Each channel is scaled to 0-255 and
    truncated toward zero.
    """
    channels = [int(c * 255) for c in rgba_color[:3]]
    return '#{:02x}{:02x}{:02x}'.format(*channels)


def plot_users_and_topics(G):
    """Draw the bipartite user/topic graph with a spring layout.

    Topics render as labeled red triangles; users render as unlabeled circles
    colored by their number of edges on a 'summer' colormap, with a colorbar
    as legend.

    Args:
        G: networkx graph whose nodes carry a 'type' attribute that is either
           'user' or 'topic'.
    """
    # Scale the colormap to the most-connected user node.
    degree_max = max([G.degree(n) for n, d in G.nodes(data=True) if n and d['type'] == 'user'])
    cmap = plt.cm.summer
    norm = plt.Normalize(vmin=1, vmax=degree_max)
    m = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
    m._A = []  # ScalarMappable needs an array set before colorbar() accepts it

    post_colors = []
    user_colors = []
    topics = []
    users = []

    # NOTE(review): user color is driven by len(G.edges(node)) (out-edges on a
    # DiGraph) while degree_max uses G.degree (in+out) — confirm the asymmetry
    # is intended.
    for node, kind in nx.get_node_attributes(G, 'type').items():  # 'kind' avoids shadowing the dict
        if kind == 'topic':
            post_colors.append('red')
            topics.append(node)
        else:
            user_colors.append(convert_to_hex(m.to_rgba(len(G.edges(node)))))
            users.append(node)

    plt.figure(1, figsize=(25, 15), dpi=400)
    pos = nx.spring_layout(G)
    # BUG FIX: 'with_labels' is an nx.draw() option, not a draw_networkx_nodes()
    # parameter — labels are drawn explicitly via draw_networkx_labels below,
    # and newer networkx releases reject the unknown keyword.
    nx.draw_networkx_nodes(G, pos, nodelist=topics, node_color=post_colors, node_shape='^')
    label_dict = {topic: str(topic) for topic in topics}
    label_dict.update({user: '' for user in users})  # blank labels for users
    nx.draw_networkx_labels(G, pos, labels=label_dict)
    nx.draw_networkx_nodes(G, pos, nodelist=users, node_color=user_colors, node_shape='o', node_size=100)
    nx.draw_networkx_edges(G, pos)
    plt.colorbar(m)
    plt.title('[ Triangle: Topic, Circle: User (color-by-degree) ]')
    plt.show()

Read Data

In [15]:
# Discover topic folders (tweets_<topic>) and build USER_DATA: user_id -> cleaned tweets.
BASE_DIR = os.curdir
TWEETS_DIRS = glob.glob(os.path.join(BASE_DIR, 'tweets_*'))
TOPICS = [os.path.split(folder)[1].replace('tweets_', '') for folder in TWEETS_DIRS]
USER_DATA = {}

total_tweets = 0
if os.path.exists(os.path.join(BASE_DIR, 'user_data.pkl')):
    # Fast path: reuse the cached sweep results from a previous run.
    # NOTE(review): pickle.load runs arbitrary code on untrusted input; this
    # is acceptable only because the cache is produced by this notebook itself.
    with open(os.path.join(BASE_DIR, 'user_data.pkl'), 'rb') as f:
        USER_DATA = pickle.load(f)
        total_tweets = sum(len(value) for value in USER_DATA.values())
else:
    # Slow path: read every per-user JSON file and clean its tweets.
    for tweet_dir in TWEETS_DIRS:
        for json_file in glob.glob(os.path.join(tweet_dir, '*.json')):
            with open(json_file, 'r') as jf:  # close the handle (original leaked it)
                user_data = json.load(jf)
            user_id = user_data['user']
            user_tweets = user_data['tweets']
            clean_user_tweets = sweep_tweets(user_tweets)
            # setdefault replaces the `if user_id not in USER_DATA.keys()` idiom.
            USER_DATA.setdefault(user_id, []).extend(clean_user_tweets)
            total_tweets += len(clean_user_tweets)

print('--- Data Ingest Summary ---')
print('Users: ' + str(len(USER_DATA.keys())))
print('Tweets: ' + str(total_tweets))
print('Topics: ' + ', '.join(TOPICS))
print('Topics: ' + ', '.join(TOPICS))
--- Data Ingest Summary ---
Users: 256
Tweets: 483312
Topics: antivax, illuminati, extraterrestrials, flatearther, haarp, chemtrails, vaccines, conspiracy, pizzagate, hollowearth
In [13]:
# Persist the ingested tweet data so later runs can take the cached fast path above.
with open(os.path.join(BASE_DIR, 'user_data.pkl'), 'wb') as f:
    pickle.dump(USER_DATA, f)

LDA and NMF Setup

In [5]:
# Vocabulary size cap passed as max_features to the vectorizers below
MAX_FEATURES = 1000

# Number of topics extracted by both the LDA and NMF models
N_COMPONENTS = 50
In [6]:
# Load every per-user JSON file for the 'chemtrails' topic and flatten all
# tweets into a single cleaned list via sweep_tweets.
CHEMTRAILS_DIR = os.path.join(BASE_DIR, 'tweets_chemtrails')
CHEMTRAILS_DATA = [json.load(open(json_file, 'r')) for json_file in glob.glob(os.path.join(CHEMTRAILS_DIR, '*.json'))]
CHEMTRAILS_TWEETS = sweep_tweets([tweet for user in CHEMTRAILS_DATA for tweet in user['tweets']])

LDA - Chemtrails Only

In [7]:
# Bag-of-words counts over the cleaned chemtrails tweets (LDA works on raw counts).
chemtrails_tf_vectorizer_lda = CountVectorizer(max_df=0.95, min_df=2, max_features=MAX_FEATURES, stop_words='english')
chemtrails_tf_lda = chemtrails_tf_vectorizer_lda.fit_transform(CHEMTRAILS_TWEETS)

# Online variational Bayes LDA; fixed random_state for reproducibility.
chemtrails_lda = LatentDirichletAllocation(n_components=N_COMPONENTS, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(chemtrails_tf_lda)
# NOTE(review): pyLDAvis.sklearn was removed in pyLDAvis 3.4 (renamed to
# pyLDAvis.lda_model); this cell assumes an older pyLDAvis release — confirm.
chemtrails_topic_vis_lda = pyLDAvis.sklearn.prepare(chemtrails_lda, chemtrails_tf_lda, chemtrails_tf_vectorizer_lda)
pyLDAvis.display(chemtrails_topic_vis_lda)
Out[7]:

NMF - Chemtrails Only

In [8]:
# tf-idf features for NMF (unlike LDA, which is fit on raw counts above).
chemtrails_tfidf_vectorizer_nmf = TfidfVectorizer(max_df=0.95, min_df=2, max_features=MAX_FEATURES, stop_words='english')
chemtrails_tfidf_nmf = chemtrails_tfidf_vectorizer_nmf.fit_transform(CHEMTRAILS_TWEETS)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (get_feature_names_out() on newer releases) — this cell assumes an older version.
chemtrails_tfidf_feature_names_nmf = chemtrails_tfidf_vectorizer_nmf.get_feature_names()

# NMF with nndsvd init; the alpha/l1_ratio regularization keywords used here
# were deprecated in later scikit-learn versions (alpha_W/alpha_H) — confirm version.
chemtrails_nmf = NMF(n_components=N_COMPONENTS, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(chemtrails_tfidf_nmf)

def display_topics(model, feature_names, no_top_words):
    """Print the top-weighted feature names for every topic in a fitted model.

    Args:
        model: fitted decomposition model exposing `components_`
               (one weight row per topic).
        feature_names: sequence mapping feature index -> word.
        no_top_words: how many of the highest-weighted words to show per topic.
    """
    print()
    for idx, weights in enumerate(model.components_):
        # Indices of the heaviest features, highest weight first.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in top_indices]
        print("Topic %d:" % (idx))
        print(" ".join(top_words))

# Print the 30 highest-weighted words for each of the N_COMPONENTS NMF topics.
display_topics(chemtrails_nmf, chemtrails_tfidf_feature_names_nmf, 30)
Topic 0:
people black american white government say believe vote doing truth country use lot said actually power sick young protect really things media dont understand hate million fact away thing money
Topic 1:
new week link order article aircraft service family city tomorrow car book report shows post airpollution airport song followers home 2018 study gold breaking read flight big president released march
Topic 2:
geoengineering lookup wakeup fukushima spraying 17 haarp speakup opchemtrails srm climatechange planet chemicals sun sprayed skies spray 2017 nuclear toxic aerosol clouds weathermodification solar daily 18 aluminum engineering killing climate
Topic 3:
trump president donald maga obama russia mueller hillary america fbi russian says clinton campaign war vote said media comey anti democrats breaking state country national office house retweet michael american
Topic 4:
pollution environment air climatechange clean plastic epa climate cnn oceans environmental shared levels cities waste toxic ocean study health china uk breathe dangerous problem major science cut earth million companies
Topic 5:
just thought bad said say things isn little mean oh wrong leave looking doing case away really wait makes use bought clear im guys better saying probably shit yeah hard
Topic 6:
like looks sounds feel really does retweet doesn real guy shit little dont lot rt looked america things hope left stuff follow tweet bad isn thing true sound free big
Topic 7:
day beautiful bad tomorrow earth spray hope game sun morning wonderful week single friends having spraying white avgeek sky start april released enjoy yesterday days thought far night school pic
Topic 8:
delhi airpollution air quality pm25 vs ist knowyourair worse london nyc sydney beijing low 25 matter lt breathe force better levels uk breathing children study clean raf city worst flight
Topic 9:
time long tell night end play little matter high big 1st start damn state uk maybe wake sun head real soon second read having mueller say media liberal feel live
Topic 10:
know didn doesn dont doing does qanon knows lot used trying lol exactly sure retweet better truth say hell stuff guy wouldn work feeling gonna happen america knew thats history
Topic 11:
good morning luck thing night friends bad evening job work really isn idea start things hope doing point feel looking yes guy friday guys say dear weekend stuff game view
Topic 12:
today ago yesterday years service saw spraying uk thing work flight london long flying heavy beautiful view sky took photo friend skies march im home feeling city seen high guy
Topic 13:
health 5g depopulation radiation dews human technology smart frequency truth ghz humanity emr waves energy directed frequencies airpollution global public high microwaves yes electromagnetic nwo vaccines microwave earth harm cancer
Topic 14:
don care believe forget say sorry tell understand agree oh use follow stupid mean trust probably remember lol truth russian won women end happen feel wrong doesn stand black read
Topic 15:
plasticpollution plastic plasticfree waste use environment oceans ocean sea uk reduce single problem companies using used end cut clean ice paper planet free 40 come ban sign trying issue bit
Topic 16:
love peace night beautiful arlena art family beauty woman picture country game amazing animals hate followers god guy live weekend care song oh gonna seeing truly absolutely agree thing read
Topic 17:
right left far wrong head country thing oh vote im sorry money david really maga truth run weathermodification protect power american yeah guy big saying house chicago kind white mean
Topic 18:
think really didn funny better does yes normal actually doesn wrong children gonna im gun trying making stupid heard far wants school thinking dont come makes having bad things said
Topic 19:
chemtrails opchemtrails depopulation sky haarp sprayed spraying weathermodification climatechange skies 2018 srm weatherwarfare conspiracy real daily toxic sun morning barium nwo planes tv cloud qanon electromagnetic spray waves nano chemtrail
Topic 20:
got finally bit black big ive hope lol yeah work tried 10 guy left sorry baby shit said check away hours yesterday try harrier tweet blue car told phone friend
Topic 21:
face feel head come conspiracy work shows bring really political rest facebook say dog heart white having question said shot british piece justice goes mean cat shut showing low warning
Topic 22:
need really dont start case hey protect work needs guys country airpollution government actually person says doesn support gun better check does vote thing cancer proof stand turn said energy
Topic 23:
world order war live control david nwo smart spraying biggest yes electromagnetic free population amazing end radiation wonderful america food report aircraft children needs 10 history problem peace money global
Topic 24:
did say obama didn said job miss fbi vote actually use really pay start tell wow money question fucking thing hours comey free ask tweet happen hillary wife land learn
Topic 25:
great work job america doing making hope weekend idea read avgeek really looks friends britain nation team article getting hear american film fun stuff lot flying friend hate say answer
Topic 26:
thank god bless service support lol rt arlena tweet country welcome job wonderful christmas family kind heart fight hard amazing usa following friend beautiful truly oh stand huge words president
Topic 27:
happy birthday friends dear weekend friday beautiful wish morning week year wonderful friend hello im hope followers evening dog days earth makes hear guys melania bless lol check general night
Topic 28:
going youre tomorrow im lot forward week long miss yrs said getting money 2018 mccabe days really win pay trying oh break happen hope media girl sad silver sure war
Topic 29:
news fake breaking fox media latest global cnn alert watch report daily local msm 2018 says story read california real truth tv bad lying jobs comey 2017 minutes syria election
Topic 30:
thanks ok problem retweet rt share lol sure helping weekend hours lot live ive check info kind dear trying big follow thought read looking air god beautiful open really making
Topic 31:
look sky blue does wake outside forward clouds inside beautiful oh spraying check normal eyes white really exactly sprayed sun chemicals cloud aren real state movie tell happening sick skies
Topic 32:
stop spraying non won chemicals killing ban wake wakeup children start planet using nwo calling crime happening needs sign stupid giving damage shit kids poison playing spray violence matter making
Topic 33:
way better goes used half home win work wish hard follow kids knows actually didn wrong obama power hillary dont looking works long cool use said school bad doesn american
Topic 34:
nice goal hello really meet try red makes view clouds lol blue finally night place damn play hear said sent things avgeek start fucking tweets fun energy board arlena end
Topic 35:
ve seen heard car thing years little thought read getting lost said tried home say used things waiting told long oh away taken guys actually bought asked sure decades makes
Topic 36:
man black white woman house say police called young old oh said shot little men women free family america muslim bernie guy gun girl says power trying obama vote retweet
Topic 37:
twitter account tweet followers tweets facebook follow read week night thing friends following use signed stopped report political social test trying sent hey google truth page morning doesn gets ai
Topic 38:
want really dont say come work stay christmas read democrats americans free gun doesn control hear america buy book fight does fly makes support children didn money talk true country
Topic 39:
ll send buy sure money maybe leave soon wait tomorrow tell lying use things said bet say theyre little doing fine google yeah set stay tweet remember days home thing
Topic 40:
make sure better does im work difference money america free tell things feel trying lets sense use thing green pretty gonna high truth deal wants hell lot clear public happen
Topic 41:
video watch youtube share live says shows check tv 2018 follow david didn greatawakening white showing 2017 obama truth days listen air qanon music sent breaking interview rt makes science
Topic 42:
post published new report work night heavy daily snow office 2017 high facebook rt gold photos page old fun said weekend 11 run flight washington secret goes welcome bad account
Topic 43:
year old years ago 10 20 50 13 school 15 girl far shot million 16 25 12 say photo obama 11 days police remember away 30 took billion end 2018
Topic 44:
weather climate change modification control warfare climatechange global engineering haarp lookup cnn real sky jfk sprayed wakeup epa blue speech clouds natural used planet speakup weatherwarfare srm human california earth
Topic 45:
water food rain toxic problem air test clean chemicals aluminum use shared barium chemical lead california natural waste end check poison oil reduce eat law radiation epa skies money amazing
Topic 46:
help opchemtrails raise needs better reduce support young campaign plasticpollution money times children fight gt spread save info retweet srm rt school understand share things outside breathing police national sure
Topic 47:
life real death god family save animals better earth 12 live artist art story rest home person entire planet years change care oil children doesn country minutes thought child lives
Topic 48:
best thing worst friends doing seen luck internet known weekend wish probably family maga home buy season birthday far trying does british food friend goal old silver public easy follow
Topic 49:
let hope talk forget dont say country work enjoy open game start pray remember ok walk win play continue end okay tell vote ill won change maga clear hear shit

Graph Analysis Setup

In [9]:
# Build a directed bipartite graph: edge user -> topic whenever one of the
# user's tweets contains the topic name as a (case-insensitive) substring.
G = nx.DiGraph()

for topic in TOPICS:
    # Each topic node collects the tweets that mention it.
    G.add_node(topic, type='topic', tweets=[])
    
for user_id, user_tweets in USER_DATA.items():
    G.add_node(user_id, type='user')
    topic_not_found = True
    for tweet in user_tweets:
        for topic in TOPICS:
            if topic in tweet.lower():
                G.add_edge(user_id, topic)
                # NOTE(review): G.node was removed in networkx 2.4 (G.nodes on
                # newer releases); this cell assumes an older networkx — confirm.
                G.node[topic]['tweets'].append(tweet)
                topic_not_found = False
    if topic_not_found: # Remove nodes with no edges (most-likely filtered due to being non-english)
        G.remove_node(user_id)

Graph Analysis - Users to Topics

In [10]:
# Render the bipartite user/topic graph built above.
plot_users_and_topics(G)

Graph Analysis - Most Mentioned

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator  # NOTE(review): unused in this cell
from collections import namedtuple  # NOTE(review): unused in this cell


n_groups = len(TOPICS)

# Average mentions per collected user file: tweets stored on the topic node
# divided by the number of per-user JSON files gathered for that topic.
# NOTE(review): G.node was removed in networkx 2.4 (G.nodes on newer releases).
avg_mentions = [len(G.node[topic]['tweets'])/len(glob.glob(os.path.join(BASE_DIR, 'tweets_'+topic, '*.json'))) for topic in TOPICS]

# Sort topics by average mentions, descending, keeping each pair intact.
sorted_mentions, sorted_topics = zip(*sorted(zip(avg_mentions, TOPICS), reverse=True))

fig, ax = plt.subplots(figsize=(25,15), dpi=400)

index = np.arange(n_groups)
bar_width = 0.35

opacity = 1

rects1 = ax.bar(index, sorted_mentions, bar_width,
                alpha=opacity, color='r',
                label='Topics')


ax.set_xlabel('Topics', fontsize=16)
ax.set_ylabel('Average # of user mentions', fontsize=16)
ax.set_title('Average # of times topic has been mentioned by users', fontsize=18)
ax.set_xticks(index)
ax.set_xticklabels(sorted_topics, fontsize=14)

plt.show()

LDA - All User Tweets

In [12]:
# Flatten every user's cleaned tweets into one corpus for a global LDA run.
ALL_USER_TWEETS = []
for user_id, user_tweets in USER_DATA.items():
    ALL_USER_TWEETS.extend(user_tweets)

# Bag-of-words counts (LDA expects raw term counts, not tf-idf).
all_user_tf_vectorizer_lda = CountVectorizer(max_df=0.95, min_df=2, max_features=MAX_FEATURES, stop_words='english')
all_user_tf_lda = all_user_tf_vectorizer_lda.fit_transform(ALL_USER_TWEETS)

# Same LDA configuration as the chemtrails-only run, fit on the full corpus.
all_user_lda = LatentDirichletAllocation(n_components=N_COMPONENTS, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(all_user_tf_lda)
all_user_topic_vis_lda = pyLDAvis.sklearn.prepare(all_user_lda, all_user_tf_lda, all_user_tf_vectorizer_lda)
pyLDAvis.display(all_user_topic_vis_lda)
Out[12]: